In [ ]:
import numpy as np  # needed by later cells that draw random sample data
import pandas as pd

# Limit DataFrame/Series reprs to 10 rows. The fully-qualified key is required:
# the bare pattern 'max_rows' also matches 'styler.render.max_rows' in newer
# pandas releases and raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 10)
dtype for representing categorical, or factor, data
In [ ]:
# Ten single-letter labels drawn from {'a', 'b', 'c'}, stored as a Categorical.
c = pd.Categorical(list('abbcabaaaa'))
In [ ]:
c
In [ ]:
c.describe()
In [ ]:
c.codes
In [ ]:
c.categories
In [ ]:
# Show an ordered version of the categorical. The result is only displayed,
# not assigned, so `c` itself stays unordered.
c.as_ordered()
cat accessor
In [ ]:
# Two-column frame: the categorical `c` alongside ten standard-normal draws.
dta = pd.DataFrame({
    'factor': c,
    'x': np.random.randn(10),
})
In [ ]:
dta.head()
In [ ]:
dta.dtypes
In [ ]:
dta.factor.cat
In [ ]:
dta.factor.cat.categories
In [ ]:
dta.factor.describe()
fditemno to a Categorical Type. Use describe.
In [ ]:
# [Solution Here]
In [ ]:
%load solutions/load_nfs_categorical.py
Pandas provides conveniences for working with dates
In [ ]:
# Daily DatetimeIndex of 75 consecutive days beginning 1 January 2015.
dates = pd.date_range(start="1/1/2015", periods=75, freq="D")
dates
In [ ]:
# Random daily time series: 75 standard-normal draws indexed by `dates`.
y = pd.Series(np.random.randn(75), index=dates)
y.head()
In [ ]:
y.reset_index().dtypes
When a datetime type is in a DataFrame, there is a special dt accessor
In [ ]:
# Move the DatetimeIndex into a regular column so the `dt` accessor can be used.
# Note the naming: the series values land in column 't' (via name='t'), and the
# former index (default column name 'index') is renamed to 'y', so `dta.y`
# holds the dates.
dta = (y.reset_index(name='t').
rename(columns={'index': 'y'}))
In [ ]:
dta.head()
In [ ]:
dta.dtypes
In [ ]:
dta.y.dt.freq
In [ ]:
dta.y.dt.day
In [ ]:
# Label-based slice of the first fifteen days (both endpoints are included).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
# label-based replacement.
y.loc["2015-01-01":"2015-01-15"]
DatetimeIndex supports partial string indexing
In [ ]:
y["2015-01"]
DatetimeIndexResampler object
In [ ]:
# Bin the daily series by month. This only builds the resampler object; an
# aggregation such as .mean() must be called on it to produce values.
# NOTE(review): recent pandas releases (2.2+) deprecate the "M" alias in
# favour of "ME" — confirm against the pandas version this notebook targets.
resample = y.resample("M")
In [ ]:
resample.mean()
Or go to a higher frequency, optionally specifying how to fill in the
In [ ]:
# Upsample from daily to hourly; the new intra-day timestamps are filled by
# carrying the last observed value forward (method='ffill').
# NOTE(review): recent pandas releases (2.2+) deprecate the 'H' alias in
# favour of 'h' — confirm against the pandas version this notebook targets.
y.asfreq('H', method='ffill')
There are convenience methods to lag and lead time series
In [ ]:
y
In [ ]:
# Lag: move every value one period later; the first entry becomes NaN.
y.shift(1)
In [ ]:
# Lead: move every value one period earlier; the last entry becomes NaN.
y.shift(-1)
In [ ]:
# Random walk: cumulative sum of 1000 standard-normal steps, one per day
# starting 1 January 2000.
ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
).cumsum()
In [ ]:
# Rolling window over the last 60 observations of `ts`. Like the resampler,
# this is a lazy object: call an aggregation (e.g. .mean()) to get values.
rolling = ts.rolling(window=60)
rolling
In [ ]:
rolling.mean()
parse_dates keyword of read_csv
In [ ]:
# [Solution here]
In [ ]:
%load solutions/load_nfs_datetime.py
In [ ]:
# Load the AIS transit segments, parsing the segment start/end columns as
# datetimes, plus the vessel information table.
# This is a bit slow because of the date parsing.
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 (format
# inference is the default there) — confirm whether it can be dropped for the
# pandas version this notebook targets.
transit = pd.read_csv("../data/AIS/transit_segments.csv",
parse_dates=['st_time', 'end_time'],
infer_datetime_format=True)
vessels = pd.read_csv("../data/AIS/vessel_information.csv")
In [ ]:
vessels.head()
In [ ]:
transit.head()
In [ ]:
vessels.columns.intersection(transit.columns)
merge will use the common columns if we do not explicitly specify the columns
In [ ]:
transit.merge(vessels).head()
Watch out, when merging on columns, indices are discarded
In [ ]:
# 25 daily rows of random data in columns 0 and 1, plus a key column 2 that
# cycles through 'a'..'e' in runs of five.
A = pd.DataFrame(
    data=np.random.randn(25, 2),
    index=pd.date_range(start='1/1/2015', periods=25),
)
A[2] = np.repeat(['a', 'b', 'c', 'd', 'e'], 5)
A
In [ ]:
# Five rows of random data in columns 0 and 1, with one key 'a'..'e' per row
# in column 2.
B = pd.DataFrame(data=np.random.randn(5, 2))
B[2] = ['a', 'b', 'c', 'd', 'e']
B
In [ ]:
# Merge on the shared key column 2. Because the merge is on a column, A's
# DatetimeIndex is not carried into the result.
A.merge(B, on=2)
In [ ]:
# Index both tables by the vessel identifier 'mmsi', modifying each in place.
for _frame in (transit, vessels):
    _frame.set_index('mmsi', inplace=True)
In [ ]:
# join aligns the two frames on their indexes (both set to 'mmsi' above).
transit.join(vessels).head()
../data/NationalFoodSurvey/NFS_1974/
In [ ]:
%load solutions/join_nfs.py
concat function for this
In [ ]:
# Load two daily Guinea reports, each indexed by the (Date, Description) pair
# so the frames share the same two-level index structure.
df1 = pd.read_csv('../data/ebola/guinea_data/2014-08-04.csv',
index_col=['Date', 'Description'])
df2 = pd.read_csv('../data/ebola/guinea_data/2014-08-26.csv',
index_col=['Date', 'Description'])
In [ ]:
df1.shape
In [ ]:
df2.shape
In [ ]:
df1.head()
In [ ]:
df2.head()
In [ ]:
df1.index.is_unique
In [ ]:
df2.index.is_unique
We can concatenate on the rows
In [ ]:
# Stack the two reports vertically (row-wise is concat's default axis).
df = pd.concat([df1, df2])
df.shape
glob.glob useful; re.search useful
In [ ]:
# [Solution here]
In [ ]:
%load solutions/concat_nfs.py
cat and dt accessors we've already seen; str accessor that provides fast string operations on columns
In [ ]:
vessels.type
In [ ]:
# Largest number of '/' separators found in any single `type` value.
vessels.type.str.count('/').max()
nan-padding
In [ ]:
# Split each `type` value on '/' into separate columns (expand=True); rows
# with fewer parts are padded with missing values.
vessels.type.str.split('/', expand=True)
"Ref_ food groups.txt"
In [ ]:
# [Solution here]
In [ ]:
%load solutions/nfs_dairy.py